Importing libraries¶

In [1]:
from io import StringIO
import json
import os

import cv2
import dash
from dash import dash_table
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import pandas as pd
from PIL import Image, ImageEnhance, ImageFilter
import plotly.express as px
import plotly.graph_objects as go
import pytesseract
from pytesseract import Output as Output1
from skimage import data

Setting default display for pandas Dataframes¶

In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

Reading image¶

In [3]:
fileName = os.path.abspath(".\\ADMIN1.jpg")
img = cv2.imread(fileName)

img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

Applying Tesseract OCR to image¶

In [5]:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
d = pytesseract.image_to_data(img, output_type=Output1.DICT)

Processing the DataFrame with the dictionary source¶

In [6]:
dfCoord = pd.DataFrame.from_dict(d)
dfCoord = dfCoord[dfCoord["conf"] != "-1"]
dfCoord = dfCoord.drop(["level", "page_num"], axis=1)
dfCoord = dfCoord[dfCoord["text"].apply(lambda x: x.strip()) != ""]
dfCoord = dfCoord.reset_index(drop=True)
In [8]:
dfCoord
Out[8]:
block_num par_num line_num word_num left top width height conf text
0 2 1 1 1 772 253 116 67 96.262962 City
1 2 1 1 2 907 253 58 53 96.516953 of
2 2 1 1 3 984 254 181 66 96.516953 Sugar
3 2 1 1 4 1187 253 143 53 96.221062 Land
4 2 1 2 1 773 334 94 53 96.756058 PO
5 2 1 2 2 893 334 112 53 96.736855 Box
6 2 1 2 3 1026 334 150 53 95.912483 5029
7 2 1 3 1 772 412 183 68 96.548904 Sugar
8 2 1 3 2 976 412 161 62 95.715347 Land,
9 2 1 3 3 1162 412 86 54 95.531441 TX
10 2 1 3 4 1272 411 369 55 96.337097 77487-5029
11 5 1 1 1 1870 215 246 54 96.657501 Contact:
12 5 2 1 1 1870 293 155 67 94.731590 (281)
13 5 2 1 2 2047 294 289 53 95.970184 275-2750
14 5 2 2 1 1871 374 239 66 93.296806 Monday
15 5 2 2 2 2129 404 21 7 93.024345 -
16 5 2 2 3 2174 373 189 67 96.272079 Friday
17 5 2 3 1 1870 452 255 55 91.539734 8:00AM-
18 5 2 3 2 2147 451 233 57 96.357254 5:00PM
19 6 1 1 1 3480 179 376 61 96.346367 Customer
20 6 1 1 2 3880 179 192 77 96.650040 Copy
21 7 1 1 1 301 624 325 43 96.015770 CUSTOMER
22 7 1 1 2 647 624 156 42 96.151947 NAME
23 7 1 1 3 1520 622 317 43 96.419357 CUSTOMER
24 7 1 1 4 1857 622 92 42 96.761429 NO.
25 7 1 1 5 2560 621 218 42 94.474831 PARCEL
26 7 1 1 6 2798 621 50 41 94.474831 ID
27 7 1 1 7 3580 618 241 43 96.516258 SERVICE
28 7 1 1 8 3842 618 278 42 96.695045 LOCATION
29 7 1 2 1 301 747 154 43 96.600708 FORT
30 7 1 2 2 474 747 153 43 95.269295 BEND
31 7 1 2 3 649 746 87 43 95.269295 ISD
32 7 1 2 4 1665 745 145 43 96.444336 19057
33 7 1 2 5 3476 742 145 42 96.451721 16431
34 7 1 2 6 3648 740 313 44 95.787308 LEXINGTON
35 7 1 2 7 3982 741 139 42 95.833969 BLVD
36 7 1 3 1 304 874 122 41 95.757881 BILL
37 7 1 3 2 449 873 246 43 96.369598 NUMBER
38 7 1 3 3 1302 873 121 42 96.681633 BILL
39 7 1 3 4 1445 873 141 42 96.085312 DATE
40 7 1 3 5 2058 870 272 43 0.000000 P\0070)
41 7 1 3 6 2232 864 34 65 54.930416 0)
42 7 1 3 7 2276 864 28 65 0.000000 \h
43 7 1 3 8 2324 864 23 65 25.574455 me
44 7 1 3 9 2349 870 31 43 60.154186 3
45 7 1 3 10 2777 869 265 43 95.912941 ACCOUNT
46 7 1 3 11 3059 869 140 42 96.607018 TYPE
47 7 1 3 12 3850 867 108 42 96.227783 DUE
48 7 1 3 13 3980 867 140 42 96.765762 DATE
49 7 1 4 1 300 996 188 42 96.659859 209740
50 7 1 4 2 1304 993 279 43 96.648819 01/14/2022
51 7 1 4 3 2140 993 153 42 96.638290 29336
52 7 1 4 4 2787 985 404 65 96.058197 INSTITUTIONAL
53 7 1 4 5 3850 989 271 43 96.020142 02/04/2022
54 7 1 5 1 1442 1095 194 42 95.901024 METER
55 7 1 5 2 1775 1094 148 41 95.866302 READ
56 7 1 5 3 1986 1093 281 43 96.328667 PREVIOUS
57 7 1 5 4 2342 1093 272 43 95.932999 CURRENT
58 7 1 5 5 2686 1092 275 42 95.596924 PREVIOUS
59 7 1 5 6 3090 1091 267 43 96.608841 CURRENT
60 7 1 6 1 294 1124 383 42 47.414356 PSN
61 7 1 6 2 1415 1160 244 43 57.210556 NUMBER
62 7 1 6 3 1708 1153 56 65 31.729263 __
63 7 1 6 4 1770 1160 349 42 68.246956 CODE_READ
64 7 1 6 5 2141 1159 142 42 95.951340 DATE
65 7 1 6 6 2405 1159 150 42 96.799988 READ
66 7 1 6 7 2702 1158 243 42 96.157440 READING
67 7 1 6 8 3100 1157 247 43 96.281128 READING
68 7 1 6 9 3499 1118 195 43 24.862762 Pee
69 7 1 7 1 289 1244 195 43 96.112709 WATER
70 7 1 7 2 498 1240 222 47 96.248344 VOLUME
71 7 1 7 3 1419 1243 234 45 96.510948 70359619
72 7 1 7 4 1829 1243 37 41 95.959351 A
73 7 1 7 5 2001 1240 259 46 63.228813 12/10/2021
74 7 1 7 6 2346 1241 261 44 68.635910 01/11/2022
75 7 1 7 7 2743 1241 149 42 91.939758 50680
76 7 1 7 8 3152 1240 141 43 96.927795 51076
77 7 1 7 9 3525 1239 146 43 96.373451 39600
78 7 1 7 10 3958 1236 155 49 96.202789 $55.84
79 7 1 8 1 289 1320 194 67 95.794609 WATER
80 7 1 8 2 501 1320 153 67 24.921249 BASE.
81 7 1 8 3 3958 1310 155 50 96.890190 $80.60
82 7 1 9 1 289 1396 191 44 96.565781 WASTE
83 7 1 9 2 496 1395 196 43 96.508591 WATER
84 7 1 9 3 703 1385 217 54 96.709206 VOLUME
85 7 1 9 4 3920 1382 194 58 92.235397 $125.53
86 7 1 10 1 289 1463 177 52 95.924202 WASTE
87 7 1 10 2 500 1459 153 70 96.754730 WATER
88 7 1 10 3 706 1470 133 50 96.777153 BASE
89 7 1 10 4 3920 1459 189 58 92.410278 $107.31
90 7 1 11 1 291 1542 247 48 96.224167 SURFACE
91 7 1 11 2 572 1539 182 58 96.845490 WATER
92 7 1 11 3 3928 1538 180 50 95.955276 $127.91
93 7 1 12 1 296 2454 164 38 32.709816 250000
94 7 1 12 2 2036 2401 176 88 96.087906 READ
95 7 1 12 3 2248 2401 177 88 96.536949 CODE
96 7 1 12 4 2749 2427 124 44 96.723923 Want
97 7 1 12 5 2889 2429 44 41 96.877914 to
98 7 1 12 6 2950 2427 173 43 96.730400 receive
99 7 1 12 7 3142 2437 106 45 96.114182 your
100 7 1 12 8 3263 2428 140 42 96.768059 water
101 7 1 12 9 3419 2426 64 43 96.643692 bill
102 7 1 12 10 3500 2425 353 55 96.643692 electronically?
103 7 1 12 11 3873 2424 28 43 95.861336 It
104 7 1 12 12 3917 2425 36 44 96.322678 is
105 7 1 12 13 3970 2436 56 32 96.322678 an
106 7 1 13 1 296 2520 164 37 54.118515 200000
107 7 1 13 2 2043 2482 95 48 96.202499 A=
108 7 1 13 3 2160 2482 187 48 96.262276 Actual
109 7 1 13 4 2750 2500 111 44 96.765495 easy
110 7 1 13 5 2878 2489 85 44 96.283546 and
111 7 1 13 6 2983 2489 265 43 96.187454 convenient
112 7 1 13 7 3263 2499 100 44 96.874557 way
113 7 1 13 8 3379 2490 46 41 96.997391 to
114 7 1 13 9 3444 2488 179 44 96.758728 receive
115 7 1 13 10 3640 2499 109 43 97.011078 your
116 7 1 13 11 3763 2489 136 42 96.877388 water
117 7 1 13 12 3914 2487 62 45 96.239418 bill
118 7 1 14 1 2046 2557 35 48 91.252274 E
119 7 1 14 2 2104 2570 33 23 95.296852 =
120 7 1 14 3 2159 2557 260 48 95.296852 Estimate
121 7 1 14 4 2750 2553 112 44 96.485695 each
122 7 1 14 5 2882 2552 161 44 95.506302 month.
123 7 1 14 6 3064 2553 164 43 96.403297 Please
124 7 1 14 7 3246 2552 97 43 93.276779 visit
125 7 1 14 8 3360 2552 394 53 91.394714 sugarlandtx.gov
126 7 1 14 9 3772 2552 86 42 96.407860 and
127 7 1 14 10 3878 2551 218 54 96.407860 complete
128 7 1 15 1 299 2607 161 38 45.470829 450000
129 7 1 15 2 2046 2632 32 47 34.579712 Fe
130 7 1 15 3 2100 2644 33 24 61.350292 =
131 7 1 15 4 2155 2632 142 47 61.350292 Final
132 7 1 15 5 2749 2615 73 43 93.235832 the
133 7 1 15 6 2839 2614 111 44 88.623756 e-bill
134 7 1 15 7 2968 2614 97 54 95.468056 sign
135 7 1 15 8 3085 2625 58 42 95.468056 up
136 7 1 15 9 3160 2614 122 43 96.484940 form.
137 7 1 15 10 3305 2614 46 42 96.484940 IF
138 7 1 15 11 3368 2612 117 44 96.522202 YOU
139 7 1 15 12 3503 2613 117 43 96.046005 ARE
140 7 1 15 13 3640 2612 292 44 96.408096 ENROLLED
141 7 1 15 14 3953 2613 46 42 96.915741 IN
142 7 1 16 1 299 2696 161 37 95.253807 100000
143 7 1 16 2 2750 2678 198 43 95.299538 CREDIT
144 7 1 16 3 2966 2677 151 43 95.827713 CARD
145 7 1 16 4 3135 2677 152 43 93.078506 AUTO
146 7 1 16 5 3307 2676 128 43 93.078506 PAY-
147 7 1 16 6 3454 2677 79 43 96.955223 DO
148 7 1 16 7 3554 2676 116 43 96.155655 NOT
149 7 1 16 8 3690 2676 108 43 96.403831 PAY
150 7 1 17 1 352 2787 110 38 0.000000 BHD
151 7 1 17 2 513 2721 3 63 90.360794 |
152 7 1 17 3 2752 2739 197 45 96.298599 UTILITY
153 7 1 17 4 2967 2739 141 44 95.374298 RATE
154 7 1 17 5 3127 2738 273 45 95.660034 CHANGES
155 7 1 17 6 3420 2737 306 45 95.660034 EFFECTIVE
156 7 1 17 7 3743 2737 253 45 96.513329 JANUARY
157 7 1 17 8 4017 2738 36 50 96.847549 1,
158 7 1 18 1 324 2787 80 38 73.454102 200
159 7 1 18 2 2750 2804 117 43 95.473251 2022
160 7 1 18 3 2887 2804 110 43 95.473251 FOR
161 7 1 18 4 3016 2804 160 42 96.602379 MORE
162 7 1 18 5 3198 2802 386 44 96.589233 INFORMATION
163 7 1 18 6 3607 2802 211 43 96.762817 PLEASE
164 7 1 18 7 3835 2802 146 44 95.672592 VISIT:
165 7 1 19 1 434 2852 26 37 61.679798 9
166 7 1 19 2 2749 2863 1047 45 90.131554 WWW.SUGARLANDTX.GOV/2022RATES
167 8 1 1 1 582 2916 72 33 96.403938 Feb
168 8 1 1 2 683 2916 70 34 96.674660 Mar
169 8 1 1 3 784 2917 65 40 96.458382 Apr
170 8 1 1 4 879 2916 76 42 96.590973 May
171 8 1 1 5 982 2917 64 31 96.187035 Jun
172 8 1 1 6 1088 2915 52 33 95.815765 Jul
173 8 1 1 7 1175 2916 72 41 96.008186 Aug
174 8 1 1 8 1276 2916 71 40 96.135269 Sep
175 8 1 1 9 1380 2916 65 32 96.378448 Oct
176 8 1 1 10 1478 2916 72 31 96.549416 Nov
177 8 1 1 11 1577 2915 73 33 96.731247 Dec
178 8 1 1 12 1680 2916 65 32 96.731247 Jan
179 9 1 1 1 2777 3162 179 38 96.215485 Previous
180 9 1 1 2 2972 3162 168 38 96.396736 Balance
181 9 1 1 3 3967 3156 118 44 95.666824 $0.00
182 12 1 1 1 2775 3288 102 38 96.571953 Total
183 12 1 1 2 2895 3288 154 38 96.828270 Current
184 12 1 1 3 3064 3287 124 48 95.764877 Billing
185 12 1 1 4 3903 3280 190 54 0.000000 '$497:19)
186 13 1 1 1 4101 3360 5 10 0.000000 ")
187 14 1 1 1 2776 3413 109 38 96.016525 Total
188 14 1 1 2 2902 3413 176 37 96.468315 Amount
189 14 1 1 3 3093 3413 87 37 96.375946 Due
190 14 1 1 4 3918 3407 167 43 94.438271 $497.19
191 15 1 1 1 1887 3672 195 38 96.420074 Domestic
192 15 1 1 2 2100 3674 122 36 96.583992 meter
193 15 1 1 3 2237 3672 273 47 96.201927 consumption
194 15 1 1 4 2527 3672 138 46 96.610115 history
195 15 1 1 5 2679 3670 94 48 96.423248 only.
196 16 1 1 1 678 4094 94 54 96.485863 City
197 16 1 1 2 789 4094 47 43 96.308273 of
198 16 1 1 3 852 4094 148 54 96.654488 Sugar
199 16 1 1 4 1017 4094 119 43 96.888031 Land
200 16 1 2 1 679 4157 76 43 96.305237 PO
201 16 1 2 2 776 4158 91 42 95.918327 Box
202 16 1 2 3 886 4157 119 43 96.490356 5029
203 16 1 3 1 678 4222 148 53 96.506783 Sugar
204 16 1 3 2 844 4222 133 49 95.940292 Land,
205 16 1 3 3 997 4222 68 42 96.813469 TX
206 16 1 3 4 1083 4222 291 43 96.769180 77487-5029
207 18 1 1 1 307 4597 386 34 43.464603 **AUTOX#MIXED
208 18 1 1 2 731 4597 108 34 95.087036 AADC
209 18 1 1 3 877 4596 81 35 90.976006 750
210 18 1 1 4 995 4598 20 31 92.108597 4
211 18 1 1 5 1054 4597 109 34 89.913353 MAAD
212 18 1 1 6 1202 4595 405 36 0.000000 L29238ACL3I-A~1
213 18 1 2 1 306 4647 81 35 71.101959 4616
214 18 1 2 2 427 4647 22 34 86.746216 1
215 18 1 2 3 487 4647 54 34 86.746216 MB
216 18 1 2 4 579 4647 143 34 59.681145 O-48e2
217 19 1 1 1 305 4720 274 76 0.000000 MOTTE
218 19 1 1 2 673 4720 57 75 14.101189 ba
219 19 1 1 3 755 4720 58 75 18.686554 TR
220 19 1 1 4 918 4720 86 75 7.685303 UE
221 19 1 2 1 304 4835 154 43 96.388054 FORT
222 19 1 2 2 477 4834 155 43 95.445778 BEND
223 19 1 2 3 656 4834 86 43 89.724815 ISD
224 20 1 1 1 307 4895 152 42 96.402084 13600
225 20 1 1 2 480 4894 245 43 95.895111 MURPHY
226 20 1 1 3 744 4894 75 43 95.895111 RD
227 20 1 2 1 304 4957 305 43 95.212059 STAFFORD
228 20 1 2 2 629 4956 71 44 95.212059 TX
229 20 1 2 3 734 4956 293 44 96.490189 77477-4908
230 21 1 1 1 420 5222 140 38 96.476814 Check
231 21 1 1 2 579 5222 96 38 96.732162 here
232 21 1 1 3 691 5222 56 37 96.451721 for
233 21 1 1 4 762 5222 154 48 96.224670 change
234 21 1 1 5 931 5222 42 37 96.213905 of
235 21 1 1 6 987 5222 120 46 96.213905 billing
236 21 1 1 7 1124 5222 166 38 96.641182 address
237 21 1 1 8 1306 5221 87 48 96.434471 only
238 21 1 1 9 1408 5222 78 36 96.473549 and
239 21 1 1 10 1504 5223 91 35 96.720726 note
240 21 1 1 11 1610 5222 180 46 96.267128 changes
241 21 1 1 12 1806 5230 49 28 96.448151 on
242 21 1 2 1 421 5279 116 37 96.765495 back.
243 22 1 1 1 294 5344 108 111 60.039272 [|
244 22 1 1 2 419 5388 221 38 96.438416 Additional
245 22 1 1 3 659 5388 116 47 96.720581 Utility
246 22 1 1 4 786 5388 234 38 96.416985 Assistance
247 22 1 1 5 1036 5388 202 37 91.624390 Donation:
248 22 1 1 6 1256 5385 23 44 91.624390 $
249 23 1 1 1 2770 3753 97 28 0.000000 ———_e
250 23 1 1 2 2883 3766 73 6 27.435387 ewe
251 23 1 1 3 2973 3765 130 7 20.407097 wee
252 23 1 1 4 3195 3765 206 7 1.422691 we
253 23 1 1 5 3494 3765 94 6 14.973640 EE
254 23 1 1 6 3634 3764 72 7 21.509354 EH
255 23 1 1 7 3736 3764 90 6 24.155441 ee
256 24 1 1 1 3217 3807 215 53 96.195526 REMIT
257 24 1 1 2 3457 3805 311 55 96.309532 PORTION
258 25 1 1 1 3102 3920 210 39 95.951157 SERVICE
259 25 1 1 2 3330 3920 249 38 96.548164 LOCATION
260 26 1 1 1 3057 4033 123 38 96.122055 16431
261 26 1 1 2 3203 4032 278 39 95.631500 LEXINGTON
262 26 1 1 3 3500 4032 125 38 96.544754 BLVD
263 28 1 1 1 2665 4151 102 36 95.729660 BILL
264 28 1 1 2 2783 4150 204 37 95.729660 NUMBER
265 28 1 1 3 3182 4149 278 37 89.178780 CUSTOMER
266 28 1 1 4 3475 4149 26 37 89.178780 #
267 28 1 1 5 3727 4147 236 37 91.348328 ACCOUNT
268 28 1 1 6 3977 4147 25 36 91.348328 #
269 29 1 1 1 3275 4262 133 39 92.569412 19057
270 29 1 1 2 3798 4261 131 39 96.894951 29336
271 30 1 1 1 2748 4264 154 39 96.647865 209740
272 32 1 1 1 2710 4382 95 37 96.060104 DUE
273 32 1 1 2 2821 4381 123 37 96.515785 DATE
274 32 1 1 3 3131 4381 159 37 96.452881 AFTER
275 32 1 1 4 3307 4380 99 38 96.923332 DUE
276 32 1 1 5 3425 4380 126 37 96.922447 DATE
277 32 1 1 6 3730 4379 156 37 95.882820 TOTAL
278 32 1 1 7 3903 4378 95 37 96.855499 DUE
279 33 1 1 1 3254 4492 175 43 65.122849 $546.90
280 34 1 1 1 3780 4490 170 45 96.804626 $497.19
281 35 1 1 1 2709 4496 233 38 96.605537 02/04/2022
282 36 1 1 1 2382 4654 191 46 96.497505 Promptly
283 36 1 1 2 2588 4653 107 39 96.234222 Send
284 36 1 1 3 2710 4653 65 38 96.238342 To:
285 37 1 1 1 2527 4795 221 77 29.688271 UTR
286 37 1 1 2 2910 4795 164 77 22.503410 ee
287 37 1 1 3 3084 4794 34 52 30.895905 od
288 37 1 1 4 3158 4794 106 77 44.220432 TT
289 37 1 1 5 3313 4793 202 77 11.338158 PTT
290 37 1 1 6 3569 4794 37 75 30.251465 bod
291 37 1 1 7 3693 4793 60 77 27.943359 |
292 37 1 1 8 3778 4817 38 53 37.709404 oo
293 37 1 2 1 2530 4915 123 45 96.562340 CITY
294 37 1 2 2 2672 4915 74 43 96.466965 OF
295 37 1 2 3 2764 4914 188 44 95.701492 SUGAR
296 37 1 2 4 2972 4915 141 43 95.701492 LAND
297 38 1 1 1 2531 4980 77 42 95.579689 PO
298 38 1 1 2 2628 4978 112 43 95.579689 BOX
299 38 1 1 3 2759 4978 114 42 96.861961 5029
300 38 1 2 1 2530 5041 195 43 96.518585 SUGAR
301 38 1 2 2 2745 5040 137 43 95.475922 LAND
302 38 1 2 3 2901 5040 67 43 96.470901 TX
303 38 1 2 4 2987 5038 289 45 96.747757 77487-5029
304 39 1 1 1 2392 5311 180 41 93.200645 Please
305 39 1 1 2 2592 5311 189 41 93.200645 include
306 39 1 1 3 2800 5310 118 50 95.602089 both
307 39 1 1 4 2935 5311 79 40 95.602089 the
308 39 1 1 5 3035 5310 260 41 93.263229 Customer
309 39 1 1 6 3311 5310 28 40 90.964043 #
310 39 1 1 7 3359 5310 96 40 95.978844 and
311 39 1 1 8 3475 5309 229 41 69.688736 Account
312 39 1 1 9 3713 5304 20 60 69.688736 #
313 39 1 1 10 3765 5318 60 30 95.892525 on
314 39 1 1 11 3845 5318 115 42 95.892525 your
315 39 1 2 1 2390 5375 164 40 96.599167 check
316 39 1 2 2 2575 5374 170 41 95.206253 and/or
317 39 1 2 3 2762 5374 164 41 96.122650 Online
318 39 1 2 4 2945 5373 75 42 95.213760 Bill
319 39 1 2 5 3043 5374 94 51 96.606789 Pay
320 40 1 1 1 4177 3455 48 228 63.275684 SLAI22621PT
321 40 1 1 2 4188 3359 27 84 93.171921 16250
322 40 1 1 3 4201 3330 6 12 87.170624 -
323 40 1 1 4 4187 2990 27 326 85.283363 129238AC13.A.1.818
324 40 1 1 5 4185 2835 28 142 80.271667 3.8.0.482
325 40 1 1 6 4200 2812 5 12 91.215714 -
326 40 1 1 7 4181 2728 32 75 85.840286 www
327 40 1 1 8 4186 2564 32 153 86.481316 dataprose
328 40 1 1 9 4192 2488 20 64 96.951462 com

Build the requested structure for the data¶

In [9]:
prev_row = dfCoord.iloc[0]["word_num"]
line = dfCoord.iloc[0]["text"] + " "
min_x, min_y = dfCoord.iloc[0]["left"], dfCoord.iloc[0]["top"]
max_x, max_y = (
    dfCoord.iloc[0]["left"] + dfCoord.iloc[0]["width"],
    dfCoord.iloc[0]["top"] + dfCoord.iloc[0]["height"],
)
pre_row = dfCoord.iloc[0]["left"]

preconfig = (
    True,
    "x",
    "y",
    "above",
    1,
    {"color": "red", "width": 1, "dash": "solid"},
    "rgba(0,0,0,0)",
    "evenodd",
    "rect",
)

lines = []

for index, row in dfCoord.iterrows():

    if not index:
        continue

    if float(row["conf"]) < 50:
        continue

    if row["word_num"] > prev_row:
        if row["left"] - pre_row < 370:  # Mejorar este número
            line += row["text"] + " "
            min_x = min(min_x, row["left"])
            min_y = min(min_y, row["top"])
            max_x = max(max_x, row["left"] + row["width"])
            max_y = max(max_y, row["top"] + row["height"])
        else:
            lines.append((*preconfig, min_x, min_y, max_x, max_y, line[:-1] + "\n"))
            line = row["text"] + " "
            min_x, min_y = row["left"], row["top"]
            max_x, max_y = row["left"] + row["width"], row["top"] + row["height"]
    else:
        lines.append((*preconfig, min_x, min_y, max_x, max_y, line[:-1] + "\n"))
        line = row["text"] + " "
        min_x, min_y = row["left"], row["top"]
        max_x, max_y = row["left"] + row["width"], row["top"] + row["height"]

    prev_row = row["word_num"]
    pre_row = row["left"]

Convert the list of tuples to DataFrame¶

In [10]:
df_out = pd.DataFrame(
    lines,
    columns=[
        "editable",
        "xref",
        "yref",
        "layer",
        "opacity",
        "line",
        "fillcolor",
        "fillrule",
        "type",
        "x0",
        "y0",
        "x1",
        "y1",
        "text",
    ],
)

Create de figure for Plotly visualization with the image¶

In [11]:
fig = px.imshow(img)

Adding all the boxes for every word/phrase/sentence founded with OCR¶

In [12]:
for index, row in df_out.iterrows():
    fig.add_shape(
        type=row["type"],
        xref=row["xref"],
        yref=row["yref"],
        x0=row["x0"],
        x1=row["x1"],
        y0=row["y0"],
        y1=row["y1"],
        line=row["line"],
    )

Adding the feature to insert manually a box¶

In [13]:
fig.update_layout(
    dragmode="drawrect",
    newshape=dict(line=dict(color="red", width=1)),
)
fig.update_layout(margin={"l": 0, "r": 0, "t": 0, "b": 0})

Configure the Dash Application and adding the figure¶

In [14]:
config = {
    "modeBarButtonsToAdd": [
        # "drawline",
        # "drawopenpath",
        # "drawclosedpath",
        # "drawcircle",
        "drawrect",
        "eraseshape",
    ]
}

# Build App
app = dash.Dash(__name__)
app.layout = html.Div(
    [
        html.H4("Draw a shape, then modify it"),
        dcc.Graph(
            id="fig-image",
            figure=fig,
            config=config,
            style={"width": "150vh", "height": "150vh", "border": "1px black solid"},
        ),
        dcc.Markdown("Characteristics of shapes"),
        html.Pre(id="annotations-pre"),
    ]
)


@app.callback(
    Output("annotations-pre", "children"),
    # Output('canvaas-table', 'data'),
    Input("fig-image", "relayoutData"),
    prevent_initial_call=True,
)
def on_new_annotation(string):
    # for key in relayout_data:
    if "shapes" in string:
        print(string)
        data = string["shapes"]
        print(data)
        data = pd.DataFrame.from_dict(data)
        print(data)

        data2 = pd.DataFrame()
        ReadingSection = pd.DataFrame()
        for index, row in data.iterrows():
            y1 = int(row["y0"])
            y2 = int(row["y1"])
            x1 = int(row["x0"])
            x2 = int(row["x1"])
            ReadingSection = img[y1:y2, x1:x2]
            text = pytesseract.image_to_string(ReadingSection, config="--psm 6")
            dfReadingSection = pd.DataFrame(StringIO(text))
            data2 = data2.append(dfReadingSection)
            print(data2)
        data2 = data2.to_dict(orient="records")
        return json.dumps(data2, indent=2)
    return dash.no_update


if __name__ == "__main__":
    app.run_server(debug=True)
Dash is running on http://127.0.0.1:8050/

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
   WARNING: This is a development server. Do not use it in a production deployment.
   Use a production WSGI server instead.
 * Debug mode: on
An exception has occurred, use %tb to see the full traceback.

SystemExit: 1
C:\Users\tw\anaconda3\envs\test02-py39\lib\site-packages\IPython\core\interactiveshell.py:3405: UserWarning:

To exit: use 'exit', 'quit', or Ctrl-D.

In [ ]: